library(MultiAssayExperiment)
library(kableExtra)
library(jpeg)
library(png)
library(grid)
library(gridExtra)
library(ggplot2)
library(dplyr)
library(tidyr)
library(knitr)
library(DESeq2)
library(pheatmap)
library(edgeR)
library(readxl)
library(fgsea)
library(stringr)
The goal is to compare the new Gide MAE data with the Gide version available on ORCESTRA and assess their quality control (QC) results to ensure consistency in clinical and expression data across both datasets.
New MAE Data: - Total Patient Count: 91 (41 after subsetting to common patients with Orcestra) - Expression Data Dimensions: 61,544 genes across 41 patients (after subsetting) - Expression Data Range: Minimum: -9.965784, Maximum: 16.25238
Orcestra MAE Data: - Patient Count: 41 (same as the final subset of the new MAE) - Expression Data Dimensions: 61,544 genes across 41 patients - Expression Data Range: Minimum: -9.965784, Maximum: 16.25415
The expression data between the new MAE and Orcestra MAE datasets is identical in terms of dimensions and range, ensuring consistency:
Load multiassay .rds file, extract clinical, expression and annotations data; prepare gene expression data for analysis.
# Load MultiAssayExperiment objects
mae <- readRDS("~/BHK lab/ICB/ICB_Gide/output/ICB_Gide.rds")
mae_orcestra <- readRDS("~/BHK lab/ICB/ICB_Gide/output/ICB_GideOrcestra.rds")
# Extract clinical data
clin <- data.frame(colData(mae)) # 91 x 51
clin_orc <- data.frame(colData(mae_orcestra)) # 41 x 38
# Identify patients with "PRE" visit and check if they exist in clin_orcestra
pre_patients <- clin$patient[clin$treatment_timepoint == "PRE"]
missing_patients <- setdiff(pre_patients, clin_orc$patientid)
cat("Number of patients with 'PRE' visit not in clin_orcestra:\n", length(missing_patients), "\n")
## Number of patients with 'PRE' visit not in clin_orcestra:
## 32
cat("Missing patient IDs:\n", missing_patients, "\n")
## Missing patient IDs:
## ERR2208888 ERR2208889 ERR2208890 ERR2208892 ERR2208893 ERR2208894 ERR2208895 ERR2208897 ERR2208898 ERR2208899 ERR2208900 ERR2208901 ERR2208902 ERR2208903 ERR2208904 ERR2208906 ERR2208907 ERR2208909 ERR2208910 ERR2208912 ERR2208914 ERR2208915 ERR2208916 ERR2208918 ERR2208919 ERR2208920 ERR2208921 ERR2208922 ERR2208923 ERR2208924 ERR2208926 ERR3262562
# NOTE: The "$treatment_timepoint" column comes from the data shared with the author.
# From this, we conclude that the Gide MultiAssayExperiment from Orcestra is missing 32 "PRE" samples.
# The next step is to subset the new MAE to include only samples present in clin_orcestra for further comparisons.
# Subset to common patients between clin and clin_orcestra
patients <- clin_orc$patientid
columns <- colnames(clin_orc)
clin_subset <- clin[patients, columns] # Subset to common patients (41 x 38)
# Print assay names for both mae objects
cat("Assay names in mae:\n", names(mae), "\nAssay names in mae_orcestra:\n", names(mae_orcestra), "\n")
## Assay names in mae:
## expr_gene_tpm expr_gene_counts expr_isoform_tpm expr_isoform_counts
## Assay names in mae_orcestra:
## expr_gene_tpm expr_gene_counts expr_isoform_tpm expr_isoform_counts
# Extract expression data for comparison
expr <- assays(mae)[["expr_gene_tpm"]]
expr <- expr[, clin_subset$patientid] # Subset expression data to common patients (61544 x 41)
expr_orc <- assays(mae_orcestra)[["expr_gene_tpm"]]
# Create a table comparing dimensions and ranges of the two expression datasets
comparison_table <- data.frame(
Dataset = c("expr", "expr_orc"),
Dimensions = c(paste(dim(expr), collapse = " x "), paste(dim(expr_orc), collapse = " x ")),
Range_Min = c(min(range(expr)), min(range(expr_orc))),
Range_Max = c(max(range(expr)), max(range(expr_orc)))
)
kable(comparison_table, caption = "Expression comparison between expr and expr_orc")
| Dataset | Dimensions | Range_Min | Range_Max |
|---|---|---|---|
| expr | 61544 x 41 | -9.965784 | 16.25238 |
| expr_orc | 61544 x 41 | -9.965784 | 16.25415 |
# an optional chceking Rnaseq column shared woth both clinical data
# Check the RNA sequencing info consistency between clin_orc and clin_subset
rna_seq_comparison <- data.frame(
Dataset = c("clin_orc", "clin_subset"),
PRE = c(table(clin_orc$RNA.Sequencing)["PRE"], table(clin_subset$RNA.Sequencing)["PRE"]),
`PRE and EDT` = c(table(clin_orc$RNA.Sequencing)["PRE and EDT"], table(clin_subset$RNA.Sequencing)["PRE and EDT"])
)
# Optionally, format the table using kable for a nicer output
kable(rna_seq_comparison, caption = "Comparison of RNA.Sequencing Column between clin_orc and clin_subset")
| Dataset | PRE | PRE.and.EDT |
|---|---|---|
| clin_orc | 32 | 9 |
| clin_subset | 32 | 9 |
clin_subset and
clin_orcclin_subset and clin_orc)
are identical in structure and content for most
columns.age, sex, treatmentid) show no
differences in values or types.histo, stage, and
TMB_raw contain only NA values in both
datasets.Days.to.EDT.Biopsy (numeric in clin_subset,
character in clin_orc).# Add "orcestra_" prefix to the clin_orc columns for clarity
colnames(clin_orc) <- paste0("orcestra_", colnames(clin_orc))
# Merge the clin and clin_orc datasets for comparison
merged_clinical <- merge(clin_subset, clin_orc, by.x = "patientid", by.y = "orcestra_patientid", all = TRUE)
# Display the head of merged_clinical with a customized caption
kable(head(merged_clinical), caption = "Head of Merged Clinical Data (Note: Columns starting with 'orcestra_' (e.g., orcestra_sex) are from the Orcestra dataset, while other columns (e.g., sex) come from the newly curated MultiAssay dataset)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
| patientid | sex | age | cancer_type | histo | tissueid | treatmentid | stage | response.other.info | recist | response | treatment | dna | rna | survival_time_pfs | event_occurred_pfs | survival_time_os | event_occurred_os | survival_unit | survival_type | TMB_raw | nsTMB_raw | indel_TMB_raw | indel_nsTMB_raw | TMB_perMb | nsTMB_perMb | indel_TMB_perMb | indel_nsTMB_perMb | CIN | CNA_tot | AMP | DEL | Site.of.PRE.Biopsy | Days.to.EDT.Biopsy | Site.of.EDT.Biopsy | RNA.Sequencing | Immunofluorescence.Panel.1 | Immunofluorescence.Panel.2 | orcestra_sex | orcestra_age | orcestra_cancer_type | orcestra_histo | orcestra_tissueid | orcestra_treatmentid | orcestra_stage | orcestra_response.other.info | orcestra_recist | orcestra_response | orcestra_treatment | orcestra_dna | orcestra_rna | orcestra_survival_time_pfs | orcestra_event_occurred_pfs | orcestra_survival_time_os | orcestra_event_occurred_os | orcestra_survival_unit | orcestra_survival_type | orcestra_TMB_raw | orcestra_nsTMB_raw | orcestra_indel_TMB_raw | orcestra_indel_nsTMB_raw | orcestra_TMB_perMb | orcestra_nsTMB_perMb | orcestra_indel_TMB_perMb | orcestra_indel_nsTMB_perMb | orcestra_CIN | orcestra_CNA_tot | orcestra_AMP | orcestra_DEL | orcestra_Site.of.PRE.Biopsy | orcestra_Days.to.EDT.Biopsy | orcestra_Site.of.EDT.Biopsy | orcestra_RNA.Sequencing | orcestra_Immunofluorescence.Panel.1 | orcestra_Immunofluorescence.Panel.2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ERR2208929 | M | 90 | Melanoma | NA | Skin | Pembrolizumab | NA | NA | PD | NR | PD-1/PD-L1 | NA | rnaseq | 2.533333 | 1 | 18.366667 | 0 | month | PFS/OS | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | SQ | 7 | SQ | PRE and EDT | PRE | PRE | M | 90 | Melanoma | NA | Skin | NA | NA | PD | NR | PD-1/PD-L1 | NA | tpm | 2.533333 | 1 | 18.366667 | 0 | month | PFS/OS | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | SQ | 7 | SQ | PRE and EDT | PRE | PRE | |
| ERR2208930 | M | 66 | Melanoma | NA | Skin | Pembrolizumab | NA | NA | PD | NR | PD-1/PD-L1 | NA | rnaseq | 2.600000 | 1 | 5.533333 | 1 | month | PFS/OS | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | SQ | NA | NA | PRE | NA | NA | M | 66 | Melanoma | NA | Skin | NA | NA | PD | NR | PD-1/PD-L1 | NA | tpm | 2.600000 | 1 | 5.533333 | 1 | month | PFS/OS | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | SQ |
|
|
PRE |
|
|
|
| ERR2208931 | M | 52 | Melanoma | NA | Skin | Pembrolizumab | NA | NA | SD | NR | PD-1/PD-L1 | NA | rnaseq | 2.666667 | 1 | 6.633333 | 1 | month | PFS/OS | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | SQ | 14 | SQ | PRE | PRE and EDT | PRE and EDT | M | 52 | Melanoma | NA | Skin | NA | NA | SD | NR | PD-1/PD-L1 | NA | tpm | 2.666667 | 1 | 6.633333 | 1 | month | PFS/OS | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | SQ | 14 | SQ | PRE | PRE and EDT | PRE and EDT | |
| ERR2208933 | M | 69 | Melanoma | NA | Skin | Pembrolizumab | NA | NA | PD | NR | PD-1/PD-L1 | NA | rnaseq | 2.733333 | 1 | 3.200000 | 1 | month | PFS/OS | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | SQ | 6 | SQ | PRE and EDT | PRE and EDT | PRE and EDT | M | 69 | Melanoma | NA | Skin | NA | NA | PD | NR | PD-1/PD-L1 | NA | tpm | 2.733333 | 1 | 3.200000 | 1 | month | PFS/OS | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | SQ | 6 | SQ | PRE and EDT | PRE and EDT | PRE and EDT | |
| ERR2208934 | M | 58 | Melanoma | NA | Skin | Pembrolizumab | NA | NA | PD | NR | PD-1/PD-L1 | NA | rnaseq | 2.766667 | 1 | 33.100000 | 1 | month | PFS/OS | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Mucosa | NA | NA | PRE | PRE | PRE | M | 58 | Melanoma | NA | Skin | NA | NA | PD | NR | PD-1/PD-L1 | NA | tpm | 2.766667 | 1 | 33.100000 | 1 | month | PFS/OS | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Mucosa |
|
|
PRE | PRE | PRE | |
| ERR2208935 | F | 76 | Melanoma | NA | Skin | Pembrolizumab | NA | NA | PD | NR | PD-1/PD-L1 | NA | rnaseq | 2.800000 | 1 | 10.000000 | 1 | month | PFS/OS | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | SQ | NA | NA | PRE | NA | NA | F | 76 | Melanoma | NA | Skin | NA | NA | PD | NR | PD-1/PD-L1 | NA | tpm | 2.800000 | 1 | 10.000000 | 1 | month | PFS/OS | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | SQ |
|
|
PRE |
|
|
# Next step is to compare each columns together
# Initialize an empty df to store result
comparison_results <- data.frame(Column = character(), Clin_Subset_Summary = character(), Clin_Orc_Summary = character(), stringsAsFactors = FALSE)
# Loop through columns in clin_subset and clin_orc and compare them directly
for (col in colnames(clin_subset)) {
orcestra_col <- paste0("orcestra_", col)
if (orcestra_col %in% colnames(clin_orc)) {
# Capture summary of clin_subset
clin_subset_summary <- capture.output(summary(clin_subset[[col]]))
clin_orc_summary <- capture.output(summary(clin_orc[[orcestra_col]]))
# Combine summaries into one data frame
comparison_results <- rbind(comparison_results,
data.frame(Column = col,
Clin_Subset_Summary = paste(clin_subset_summary, collapse = " "),
Clin_Orc_Summary = paste(clin_orc_summary, collapse = " "),
stringsAsFactors = FALSE))
}
}
# Display the comparison table using kable
kable(comparison_results, caption = "Comparison of Column Summaries between clin_subset and clin_orc") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
scroll_box(width = "100%", height = "500px")
| Column | Clin_Subset_Summary | Clin_Orc_Summary |
|---|---|---|
| patientid | Length Class Mode 41 character character | Length Class Mode 41 character character |
| sex | Length Class Mode 41 character character | Length Class Mode 41 character character |
| age | Min. 1st Qu. Median Mean 3rd Qu. Max. 37.00 55.00 66.00 64.73 76.00 90.00 | Min. 1st Qu. Median Mean 3rd Qu. Max. 37.00 55.00 66.00 64.73 76.00 90.00 |
| cancer_type | Length Class Mode 41 character character | Length Class Mode 41 character character |
| histo | Mode NA’s logical 41 | Mode NA’s logical 41 |
| tissueid | Length Class Mode 41 character character | Length Class Mode 41 character character |
| treatmentid | Length Class Mode 41 character character | Length Class Mode 41 character character |
| stage | Mode NA’s logical 41 | Mode NA’s logical 41 |
| response.other.info | Mode NA’s logical 41 | Mode NA’s logical 41 |
| recist | Length Class Mode 41 character character | Length Class Mode 41 character character |
| response | Length Class Mode 41 character character | Length Class Mode 41 character character |
| treatment | Length Class Mode 41 character character | Length Class Mode 41 character character |
| dna | Mode NA’s logical 41 | Mode NA’s logical 41 |
| rna | Length Class Mode 41 character character | Length Class Mode 41 character character |
| survival_time_pfs | Min. 1st Qu. Median Mean 3rd Qu. Max. 0.4333 2.6667 9.0333 17.0374 29.7000 53.4333 | Min. 1st Qu. Median Mean 3rd Qu. Max. 0.4333 2.6667 9.0333 17.0374 29.7000 53.4333 |
| event_occurred_pfs | Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000 0.0000 1.0000 0.7073 1.0000 1.0000 | Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000 0.0000 1.0000 0.7073 1.0000 1.0000 |
| survival_time_os | Min. 1st Qu. Median Mean 3rd Qu. Max. 0.7333 5.6333 20.2333 22.6585 36.1667 53.4333 | Min. 1st Qu. Median Mean 3rd Qu. Max. 0.7333 5.6333 20.2333 22.6585 36.1667 53.4333 |
| event_occurred_os | Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000 0.0000 1.0000 0.5854 1.0000 1.0000 | Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0000 0.0000 1.0000 0.5854 1.0000 1.0000 |
| survival_unit | Length Class Mode 41 character character | Length Class Mode 41 character character |
| survival_type | Length Class Mode 41 character character | Length Class Mode 41 character character |
| TMB_raw | Mode NA’s logical 41 | Mode NA’s logical 41 |
| nsTMB_raw | Mode NA’s logical 41 | Mode NA’s logical 41 |
| indel_TMB_raw | Mode NA’s logical 41 | Mode NA’s logical 41 |
| indel_nsTMB_raw | Mode NA’s logical 41 | Mode NA’s logical 41 |
| TMB_perMb | Mode NA’s logical 41 | Mode NA’s logical 41 |
| nsTMB_perMb | Mode NA’s logical 41 | Mode NA’s logical 41 |
| indel_TMB_perMb | Mode NA’s logical 41 | Mode NA’s logical 41 |
| indel_nsTMB_perMb | Mode NA’s logical 41 | Mode NA’s logical 41 |
| CIN | Mode NA’s logical 41 | Mode NA’s logical 41 |
| CNA_tot | Mode NA’s logical 41 | Mode NA’s logical 41 |
| AMP | Mode NA’s logical 41 | Mode NA’s logical 41 |
| DEL | Mode NA’s logical 41 | Mode NA’s logical 41 |
| Site.of.PRE.Biopsy | Length Class Mode 41 character character | Length Class Mode 41 character character |
| Days.to.EDT.Biopsy | Min. 1st Qu. Median Mean 3rd Qu. Max. NA’s 2.0 6.5 9.0 15.2 14.5 68.0 26 | Length Class Mode 41 character character |
| Site.of.EDT.Biopsy | Length Class Mode 41 character character | Length Class Mode 41 character character |
| RNA.Sequencing | Length Class Mode 41 character character | Length Class Mode 41 character character |
| Immunofluorescence.Panel.1 | Length Class Mode 41 character character | Length Class Mode 41 character character |
| Immunofluorescence.Panel.2 | Length Class Mode 41 character character | Length Class Mode 41 character character |
To generate Table 1 and Figure 2 (A and B) from the paper PubMed ID 30753825.
Compare our new clinical data (91 patients) and clin_orc
data (41 patients) to the published study (120 patients) for key
characteristics and outcomes in Anti-PD-1 Monotherapy and Combined
Anti-PD-1 with Anti-CTLA-4 Immunotherapy cohorts.
clin_orc shows no CR in monotherapy.Our dataset aligns closely with the published study regarding
clinical characteristics and outcomes, with minor differences expected
due to cohort sizes. The newer dataset is likely more accurate than the
clin_orc data.
# Revert back the "orcestra_" prefix from column names
colnames(clin_orc) <- gsub("^orcestra_", "", colnames(clin_orc))
# Function to process clinical data and generate a summary table with dynamic n values
generate_clinical_summary <- function(clin_data) {
# Define responder and non-responder groups based on the criteria
clin_data$response_paper <- ifelse(
(clin_data$recist %in% c("CR", "PR") | (clin_data$recist == "SD" & clin_data$survival_time_pfs > 6)),
"Responder",
ifelse(clin_data$recist == "PD" | (clin_data$recist == "SD" & clin_data$survival_time_pfs <= 6),
"Non-responder",
NA)
)
# Subset data to mono and combo
monotherapy <- clin_data %>% filter(treatment == "PD-1/PD-L1")
combo_therapy <- clin_data %>% filter(treatment == "IO+combo")
# Count the number of responders and non-responders for dynamic n values
monotherapy_n <- monotherapy %>% group_by(response_paper) %>% summarize(n = n())
combo_therapy_n <- combo_therapy %>% group_by(response_paper) %>% summarize(n = n())
# Generate detailed summary statistics
generate_detailed_summary <- function(data) {
data %>%
group_by(response_paper) %>%
summarize(
`Age (years), median` = as.character(round(median(age, na.rm = TRUE), 1)),
Male = paste0(sum(sex == "M", na.rm = TRUE), " (", round(sum(sex == "M", na.rm = TRUE) / n() * 100, 1), "%)"),
Female = paste0(sum(sex == "F", na.rm = TRUE), " (", round(sum(sex == "F", na.rm = TRUE) / n() * 100, 1), "%)"),
`Nivolumab n (%)` = paste0(sum(treatmentid == "Nivolumab", na.rm = TRUE), " (", round(sum(treatmentid == "Nivolumab", na.rm = TRUE) / n() * 100, 1), "%)"),
`Pembrolizumab n (%)` = paste0(sum(treatmentid == "Pembrolizumab", na.rm = TRUE), " (", round(sum(treatmentid == "Pembrolizumab", na.rm = TRUE) / n() * 100, 1), "%)"),
CR = as.character(sum(recist == "CR", na.rm = TRUE)),
PR = as.character(sum(recist == "PR", na.rm = TRUE)),
SD = as.character(sum(recist == "SD", na.rm = TRUE)),
PD = as.character(sum(recist == "PD", na.rm = TRUE)),
`Median PFS (months)` = as.character(round(median(survival_time_pfs, na.rm = TRUE), 1)),
`12-month PFS (%)` = as.character(round(mean(survival_time_pfs >= 12, na.rm = TRUE) * 100, 1)),
`Median OS (months)` = as.character(round(median(survival_time_os, na.rm = TRUE), 1)),
`12-month OS (%)` = as.character(round(mean(survival_time_os >= 12, na.rm = TRUE) * 100, 1))
)
}
# Generate summaries
monotherapy_summary <- generate_detailed_summary(monotherapy) %>% mutate(Therapy = "Monotherapy")
combo_therapy_summary <- generate_detailed_summary(combo_therapy) %>% mutate(Therapy = "Combination Therapy")
# Combine and reshape the summary
combined_summary <- bind_rows(monotherapy_summary, combo_therapy_summary) %>%
pivot_longer(cols = -c(Therapy, response_paper), names_to = "Characteristic", values_to = "Value") %>%
pivot_wider(names_from = c(Therapy, response_paper), values_from = Value)
# Get the dynamic n values for monotherapy and combo therapy
monotherapy_non_responder_n <- monotherapy_n %>% filter(response_paper == "Non-responder") %>% pull(n)
monotherapy_responder_n <- monotherapy_n %>% filter(response_paper == "Responder") %>% pull(n)
combo_non_responder_n <- combo_therapy_n %>% filter(response_paper == "Non-responder") %>% pull(n)
combo_responder_n <- combo_therapy_n %>% filter(response_paper == "Responder") %>% pull(n)
# Add number of patients (n) dynamically to column headers
colnames(combined_summary) <- c("Characteristic",
paste0("Monotherapy Non-responder (n=", monotherapy_non_responder_n, ")"),
paste0("Monotherapy Responder (n=", monotherapy_responder_n, ")"),
paste0("Combination Therapy Non-responder (n=", combo_non_responder_n, ")"),
paste0("Combination Therapy Responder (n=", combo_responder_n, ")"))
# Define row names
rownames_final <- c("Age (years), median", "Male", "Female",
"Nivolumab n (%)", "Pembrolizumab n (%)", "CR", "PR",
"SD", "PD", "Median PFS (months)", "12-month PFS (%)",
"Median OS (months)", "12-month OS (%)")
rownames(combined_summary) <- rownames_final
return(combined_summary)
}
# Usage example for clin and clin_orc
summary_clin <- generate_clinical_summary(clin)
summary_clin_orcestra <- generate_clinical_summary(clin_orc)
# Display New Gide MAE summary
kable(summary_clin, caption = "New Gide MAE: Clinicopathologic Characteristics", align = "l") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
| Characteristic | Monotherapy Non-responder (n=23) | Monotherapy Responder (n=27) | Combination Therapy Non-responder (n=11) | Combination Therapy Responder (n=30) |
|---|---|---|---|---|
| Age (years), median | 67 | 62 | 51 | 65 |
| Male | 17 (73.9%) | 16 (59.3%) | 7 (63.6%) | 20 (66.7%) |
| Female | 6 (26.1%) | 11 (40.7%) | 4 (36.4%) | 10 (33.3%) |
| Nivolumab n (%) | 5 (21.7%) | 6 (22.2%) | 0 (0%) | 0 (0%) |
| Pembrolizumab n (%) | 18 (78.3%) | 21 (77.8%) | 0 (0%) | 0 (0%) |
| CR | 0 | 4 | 0 | 13 |
| PR | 0 | 19 | 0 | 13 |
| SD | 3 | 4 | 2 | 4 |
| PD | 20 | 0 | 9 | 0 |
| Median PFS (months) | 2.6 | 29 | 2.7 | 20.1 |
| 12-month PFS (%) | 0 | 74.1 | 0 | 80 |
| Median OS (months) | 5.5 | 35.8 | 17.7 | 20.8 |
| 12-month OS (%) | 21.7 | 96.3 | 54.5 | 86.7 |
# Display ORCESTRA Version summary
kable(summary_clin_orcestra, caption = "ORCESTRA Version: Clinicopathologic Characteristics", align = "l") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F)
| Characteristic | Monotherapy Non-responder (n=19) | Monotherapy Responder (n=22) |
|---|---|---|
| Age (years), median | 66 | 66 |
| Male | 14 (73.7%) | 12 (54.5%) |
| Female | 5 (26.3%) | 10 (45.5%) |
| Nivolumab n (%) | 0 (0%) | 0 (0%) |
| Pembrolizumab n (%) | 0 (0%) | 0 (0%) |
| CR | 0 | 4 |
| PR | 0 | 15 |
| SD | 3 | 3 |
| PD | 16 | 0 |
| Median PFS (months) | 2.6 | 29.4 |
| 12-month PFS (%) | 0 | 77.3 |
| Median OS (months) | 5.5 | 36 |
| 12-month OS (%) | 21.1 | 95.5 |
# Read the PNG image
img <- png::readPNG("~/BHK lab/ICB/ICB_Gide/files/paper_table1.png")
# Convert the image to a grob
img_grob <- rasterGrob(img, interpolate = TRUE)
# Convert the summary table to a grob
summary_clin_grob <- tableGrob(summary_clin, theme = ttheme_default(
core = list(fg_params = list(cex = 0.8)),
colhead = list(fg_params = list(cex = 0.8)),
rowhead = list(fg_params = list(cex = 0.8))
))
# Arrange the summary table and the image side by side
grid.arrange(summary_clin_grob, img_grob, ncol = 2, widths = c(0.6, 0.4))
This study delves into the immunotranscriptomic changes experienced by patients undergoing Anti-PD-1 monotherapy and combination therapy. We aim to delineate the gene expression profiles that distinguish responders from non-responders.
The original paper evaluated 47 PRE samples (35 responders, 12 non-responders) and 15 EDT samples (11 responders, 4 non-responders). Comparatively, our dataset contains 41 PRE and 9 POST samples.
DESeq() function for
consistent read count normalization.cpm() function to
ensure comparable visualization of expression data.Our study also aims to examine DEGs for clinical and count data
across different versions of the dataset: - Orcestra
Version: clin_orc and count_orcestra.
- New Version: clin_subset and
count.
The goal was to ensure the identified DEGs were consistent across both dataset versions. Remarkably, both analyses identified an identical number of significant DEGs, totaling 139 in each, underscoring the robustness and reproducibility of our gene signatures across diverse data structures and analytical approaches.
Notable genes including CFTR, USH2A, HOXA9, LAMB4, MYH7, and CD274 were identified as DEGs. Specifically: - LAMB4: Upregulated in non-responders, highlighting its role in immunosuppressive pathways. - CD274 (PD-L1): Known for its pivotal role in immune checkpoint pathways, associated with modulation of T cell activity.
This detailed comparison and consistent findings across dataset versions reinforce the reliability of our analytical methods and the significance of the identified DEGs in clinical contexts.
# Define responder and non-responder groups based on the criteria
assign_response_paper <- function(clin_data) {
clin_data$response_paper <- ifelse(
clin_data$recist %in% c("CR", "PR") | (clin_data$recist == "SD" & clin_data$survival_time_pfs > 6),
"Responder",
ifelse(clin_data$recist == "PD" | (clin_data$recist == "SD" & clin_data$survival_time_pfs <= 6),
"Non-responder",
NA)
)
return(clin_data)
}
# Apply the function to both clin_subset and clin_orc
clin_subset <- assign_response_paper(clin_subset)
clin_orc <- assign_response_paper(clin_orc)
# 1. Orcestra count and annotation
expr_gene_counts_orc <- assays(mae_orcestra)[["expr_gene_counts"]] # 61544 41
annot_orc <- data.frame(rowData(mae_orcestra@ExperimentList$expr_gene_counts))
# 2. Newly curated count and annotation
expr_gene_counts <- assays(mae)[["expr_gene_counts"]]
expr_gene_counts <- expr_gene_counts[, clin_orc$patientid] # 61544 X 41 dim
annot <- data.frame(rowData(mae@ExperimentList$expr_gene_counts))
# Remove duplicate genes and set as rownames
annot <- annot[!duplicated(annot$gene_name), ]
annot <- annot[, c("gene_id", "gene_name")]
analyze_gene_expression <- function(expr_gene_counts, annot, clin_subset, treatment_col = "RNA.Sequencing", treatment_pre = "PRE", response_col = "response_paper") {
# Remove duplicate genes and set as rownames
annot <- annot[!duplicated(annot$gene_name), ]
annot <- annot[, c("gene_id", "gene_name")]
# Match gene IDs and gene names, and remove rows without a gene name
expr_gene_counts$gene_id <- rownames(expr_gene_counts)
expr_gene_counts <- merge(expr_gene_counts, annot, by = "gene_id")
expr_gene_counts <- expr_gene_counts[!is.na(expr_gene_counts$gene_name), ]
rownames(expr_gene_counts) <- expr_gene_counts$gene_name
expr_gene_counts$gene_id <- NULL
expr_gene_counts$gene_name <- NULL
# Revert log-transformed data (if necessary)
expr_gene_counts <- 2^expr_gene_counts - 1
# Define sample subsets for PRE treatments based on treatment_col
pre_samples <- intersect(clin_subset$patientid[clin_subset[[treatment_col]] == treatment_pre], colnames(expr_gene_counts))
# Subset expression data for PRE samples
expr_pre <- expr_gene_counts[, pre_samples]
# Subset clin_subset to matching samples for PRE
clin_pre <- clin_subset[clin_subset$patientid %in% pre_samples, , drop = FALSE]
# Create DESeq2 dataset objects and perform normalization and differential expression analysis
dds_pre <- DESeqDataSetFromMatrix(countData = round(expr_pre), colData = clin_pre, design = as.formula(paste("~", response_col)))
dds_pre <- DESeq(dds_pre)
res_pre <- results(dds_pre, contrast = c(response_col, "Responder", "Non-responder"))
# Filter significant results
sig_res_pre <- res_pre[!is.na(res_pre$padj) & res_pre$padj < 0.05, ]
# Compute CPM and log-transform for visualization
cpm_pre <- cpm(DGEList(counts = expr_pre), log = TRUE, prior.count = 1)
# Subset DEGs
cpm_pre_degs <- cpm_pre[rownames(sig_res_pre), ]
# Generate heatmap
pheatmap_pre <- pheatmap(cpm_pre_degs, cluster_rows = TRUE, cluster_cols = TRUE, show_rownames = TRUE, show_colnames = TRUE, color = colorRampPalette(c("blue", "white", "red"))(50), silent = TRUE, main = "PRE Treatment Heatmap (Monotherapy)")
# Return results and heatmaps
list(
sig_res_pre = sig_res_pre,
pheatmap_pre = pheatmap_pre
)
}
#1. Analyze expr_gene_counts
results <- analyze_gene_expression(expr_gene_counts, annot, clin_subset)
sig_res_pre <- results$sig_res_pre
pheatmap_pre <- results$pheatmap_pre
deg_count <- length(rownames(sig_res_pre))
# 2.Analyze expr_gene_counts_orc
results_orc <- analyze_gene_expression(expr_gene_counts_orc, annot_orc, clin_orc)
sig_res_pre_orc <- results_orc$sig_res_pre
pheatmap_pre_orc <- results_orc$pheatmap_pre
deg_count_orc <- length(rownames(sig_res_pre_orc))
deg_table <- data.frame(
Dataset = c("expr_gene_counts", "expr_gene_counts_orc"),
Number_of_DEGs = c(deg_count, deg_count_orc)
)
kable(deg_table, caption = "Number of DEGs in Each Dataset")
| Dataset | Number_of_DEGs |
|---|---|
| expr_gene_counts | 139 |
| expr_gene_counts_orc | 139 |
grob1 <- pheatmap_pre$gtable
grob2 <- pheatmap_pre_orc$gtable
# Use grid.arrange to plot them side by side
grid.arrange(grob1, grob2, ncol = 2)
# Compare DEGs with a provided paper
DEGS_result_paper <- read_excel("~/BHK lab/ICB/ICB_Gide/files/1-s2.0-S1535610819300376-mmc4.xlsx")
colnames(DEGS_result_paper ) <- DEGS_result_paper [2, ]
DEGS_result_paper <- DEGS_result_paper [-c(1:2), ]
colnames(DEGS_result_paper ) <- str_replace_all(colnames(DEGS_result_paper ), '\\W', '.')
DEGS_paper <- DEGS_result_paper$Gene
if (all(rownames(sig_res_pre_orc) == rownames(sig_res_pre))) {
print("All row names match")
} else {
print("Row names do not match")
}
## [1] "All row names match"
# Find the intersection of DEGs from the paper and your results
intersect_genes <- intersect(DEGS_paper, rownames(sig_res_pre))
print(paste("Common DEGs:", paste(intersect_genes, collapse = ", ")))
## [1] "Common DEGs: ZNF683, ACOXL, BFSP2, MAGI1, XIRP1, GABRA2, GRIA2, NDST3, PTCRA, LAMB4, IDO1, CD274, ITIH5, BATF2, SMTNL1, TRAJ1, GDPD2"
Despite both curated DEG sets being identical, demonstrating consistency with the patient data, only 139 DEGs were initially identified, whereas the paper mentions over 300. We employed two approaches for further analysis:
In this approach, we reanalyzed the data using the full clinical dataset to determine if including all available patient data influences DEG identification.
In this approach, we used the new “treatment_timepoint” column
(extracted from the file 1-s2.0-S1535610819300376-mmc4.xlsx
shared by the author) instead of the “RNA.Sequencing” column. This
allowed us to distinguish pre- and post-treatment conditions more
accurately.
As a result, the newer MAE with the “treatment_timepoint” column provides more accurate results by clearly distinguishing pre- and post-treatment conditions, leading to better alignment with the paper’s findings.
# Newly curated count and annotation
expr_gene_counts <- assays(mae)[["expr_gene_counts"]] # dim 61544 x 91
annot <- data.frame(rowData(mae@ExperimentList$expr_gene_counts))
clin <- data.frame(colData(mae))
clin <- assign_response_paper(clin)
# 1. Analyze expr_gene_counts with complete clinical data for 91 patients and retain RNA.Sequencing
results_complete <- analyze_gene_expression(expr_gene_counts, annot, clin)
sig_res_pre1 <- results_complete$sig_res_pre
pheatmap_pre1 <- results_complete$pheatmap_pre
deg_count1 <- length(rownames(sig_res_pre1)) # This is 375 DEGs
# Find intersecting DEGs with DEGS_paper
intersect_genes1 <- intersect(DEGS_paper, rownames(sig_res_pre1)) # 78 intersecting genes
print(paste0("Total DEGs: ", deg_count1,
"; Common DEGs: ", length(intersect_genes1),
"; Genes: ", paste(intersect_genes1, collapse = ", ")))
## [1] "Total DEGs: 378; Common DEGs: 79; Genes: C1orf74, CD244, FCRL6, GBP1, GBP4, GFI1, IFI44L, LAX1, LCK, SELL, SLAMF1, SLAMF7, ZNF683, ACOXL, CD8B, IFIH1, STAT1, ZAP70, BFSP2, MAGI1, XIRP1, C4orf50, CXCL10, CXCL11, CXCL9, GRIA2, JAKMIP1, NDST3, RNF212, IRF1, IPCEF1, PSMB9, PTCRA, TAP1, UBD, LAMB4, TRGC2, TRGV10, IDO1, CD274, PRF1, BATF2, CRTAM, PTPRCAP, SMTNL1, UBE2L6, CNTN1, LAG3, OAS2, GCH1, TRAJ1, TRAJ13, TRAJ14, TRAJ16, TRAJ17, TRAJ27, TRAJ5, TRAJ52, HAPLN3, PATL2, NLRC3, NLRC5, SYNGR3, CCL4, CCL5, TBX21, WNT3, XAF1, IL12RB1, LILRA4, NCR1, NLRP7, SIRPG, SLA2, ZBP1, UBASH3A, APOL2, APOL6, GDPD2"
# Filter patients that are in both clinical data and expression data
patients <- intersect(clin$patientid, colnames(expr_gene_counts))
clin <- clin[clin$patientid %in% patients, ] # Filter clinical data to matching patients
expr_gene_counts <- expr_gene_counts[, patients] # Subset expression data to matching patients
# 2. Run analysis for the subset of patients
results_complete2 <- analyze_gene_expression(expr_gene_counts, annot, clin, treatment_col = "treatment_timepoint", treatment_pre = "PRE")
sig_res_pre2 <- results_complete2$sig_res_pre
pheatmap_pre2 <- results_complete2$pheatmap_pre
deg_count2 <- length(rownames(sig_res_pre2)) # DEG 326
# Find intersecting DEGs with DEGS_paper for second run
intersect_genes2 <- intersect(DEGS_paper, rownames(sig_res_pre2)) #46 same genes
# Print DEG counts and intersections
print(paste0("Total DEGs: ", deg_count2,
"; Common DEGs: ", length(intersect_genes2),
"; Genes: ", paste(intersect_genes2, collapse = ", ")))
## [1] "Total DEGs: 754; Common DEGs: 172; Genes: C1orf74, CD2, CD244, CD48, FCRL6, GBP1, GBP4, GBP5, GFI1, GRIK3, IFI44L, LAX1, LCK, PIK3CD-AS1, PYHIN1, SELL, SLAMF1, SLAMF6, SLAMF7, TNFRSF9, TRAF3IP3, ZNF683, ACOXL, CD8A, CD8B, CMPK2, DUSP2, IFIH1, IL18RAP, SP100, STAT1, STAT4, ZAP70, BFSP2, GPR171, MAGI1, PARP14, TIGIT, XIRP1, C4orf50, CD38, CXCL10, CXCL11, CXCL13, CXCL9, DDX60, DTHD1, GRIA2, HERC6, JAKMIP1, LAP3, NDST3, RNF212, TNIP3, GZMA, IRF1, ITK, BTN3A1, BTN3A2, BTN3A3, HLA-A, HLA-F, IPCEF1, PSMB8, PSMB8-AS1, PSMB9, PTCRA, SAMD3, TAGAP, TAP1, TAP2, TAPBP, THEMIS, TNF, TNFAIP3, UBD, LAMB4, SAMD9L, TRBC2, TRBV5-6, TRGC2, TRGV10, IDO1, AKNA, CD274, DBH, SEMA4D, IFIT3, PRF1, PRKCQ, BATF2, CD3E, CD6, CRTAM, LUZP2, PTPRCAP, SMTNL1, TRIM21, UBE2L6, ARHGAP9, CLEC4E, CLEC6A, CNTN1, DHH, KLRD1, LAG3, LYZ, OAS2, EPSTI1, GCH1, TRAC, TRAJ1, TRAJ10, TRAJ13, TRAJ14, TRAJ16, TRAJ17, TRAJ18, TRAJ27, TRAJ36, TRAJ38, TRAJ42, TRAJ44, TRAJ45, TRAJ5, TRAJ50, TRAJ52, TRAV8-3, HAPLN3, PATL2, TRIM69, CORO1A, ITGAD, ITGAL, NLRC3, NLRC5, PSMB10, SYNGR3, CCL3, CCL4, CCL5, CD7, SLFN12L, TBX21, TMC8, WNT3, XAF1, BST2, DENND1C, HSH2D, IL12RB1, IL4I1, LILRA4, MAP4K1, NCR1, NLRP7, RASAL3, CST7, SIRPG, SLA2, ZBP1, ZNF831, ZNFX1, MX1, UBASH3A, APOL2, APOL6, IL2RB, GDPD2, GPR174, IL2RG, P2RY8"
# Extract grobs from pheatmap objects
grob1 <- pheatmap_pre1$gtable
grob2 <- pheatmap_pre2$gtable
# Display the first heatmap with a caption
grid.arrange(grob1, ncol = 1, top = textGrob("Heatmap 1: Monotherapy", gp = gpar(fontsize = 14, fontface = "bold")))
# Display the second heatmap with a caption
grid.arrange(grob2, ncol = 1, top = textGrob("Heatmap 2: Combination Therapy", gp = gpar(fontsize = 14, fontface = "bold")))
This analysis aims to identify and compare enriched KEGG signaling pathways in pre-treatment and early during treatment (EDT) samples from anti-PD-1 monotherapy. The focus is on statistically significant pathways (nominal p < 0.05) and comparing them with known immune-related pathways from the literature.
KEGG pathways were sourced from the file
c2.cp.kegg.v7.4.symbols.gmt.
fgsea package to determine pathway
enrichment.Common Pathways: KEGG_CHEMOKINE_SIGNALING_PATHWAY,
KEGG_NATURAL_KILLER_CELL_MEDIATED_CYTOTOXICITY were identified in our
DEG analysis using the complete clinical data, with the new
treatment_timepoint column.
# Load KEGG pathways for MEDICUS
pathwaysKEG <- gmtPathways("~/BHK lab/ICB/ICB_Gide/files/c2.cp.kegg.v7.4.symbols.gmt")
# 1. Perform GSEA for sig_res_pre2 (Monotherapy - Pre-treatment and EDT)
# Rank genes by -log10 of p-value
ranked_genes1 <- -log10(sig_res_pre2$pvalue)
names(ranked_genes1) <- rownames(sig_res_pre2)
# Perform fgsea for sig_res_pre2 (Monotherapy)
fgsea_results1 <- fgsea(pathways = pathwaysKEG, stats = ranked_genes1)
# Tidy and filter results for fgsea (Significance at nominal p < 0.05)
fgsea_tidy1 <- fgsea_results1 %>%
as_tibble() %>%
arrange(desc(NES)) %>%
mutate(Significance = ifelse(padj <= 0.05, "Significant", "Not Significant"))
fgsea_tidy1_top <- head(fgsea_tidy1, 10)
kable(fgsea_tidy1_top, caption = "Top 10 KEGG Pathway Enrichment Results for Monotherapy (Pre-treatment & EDT)")
| pathway | pval | padj | log2err | ES | NES | size | leadingEdge | Significance |
|---|---|---|---|---|---|---|---|---|
| KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY | 0.0052315 | 0.4490413 | 0.4070179 | 0.6388218 | 1.892928 | 8 | CXCL10, …. | Not Significant |
| KEGG_NATURAL_KILLER_CELL_MEDIATED_CYTOTOXICITY | 0.0068474 | 0.4490413 | 0.4070179 | 0.5030355 | 1.858223 | 15 | SHC3, NC…. | Not Significant |
| KEGG_CHEMOKINE_SIGNALING_PATHWAY | 0.0167553 | 0.4490413 | 0.3524879 | 0.4516649 | 1.715945 | 16 | SHC3, CX…. | Not Significant |
| KEGG_VIRAL_MYOCARDITIS | 0.0254927 | 0.5693372 | 0.3524879 | 0.5263657 | 1.623955 | 9 | MYH7, MY…. | Not Significant |
| KEGG_CYTOSOLIC_DNA_SENSING_PATHWAY | 0.0867710 | 0.8474092 | 0.1797823 | 0.5397852 | 1.435343 | 6 | CXCL10, …. | Not Significant |
| KEGG_LONG_TERM_DEPRESSION | 0.0699659 | 0.8474092 | 0.2220560 | 0.7898936 | 1.410733 | 2 | GRID2, GRIA2 | Not Significant |
| KEGG_TRYPTOPHAN_METABOLISM | 0.1074020 | 0.8474092 | 0.1619789 | 0.5602341 | 1.405966 | 5 | IDO1, WARS1 | Not Significant |
| KEGG_NOTCH_SIGNALING_PATHWAY | 0.0750853 | 0.8474092 | 0.2139279 | 0.7859043 | 1.403608 | 2 | DTX3L, PTCRA | Not Significant |
| KEGG_FOCAL_ADHESION | 0.0990629 | 0.8474092 | 0.1619789 | 0.4705017 | 1.394169 | 8 | SHC3, RE…. | Not Significant |
| KEGG_PATHOGENIC_ESCHERICHIA_COLI_INFECTION | 0.0972696 | 0.8474092 | 0.1864326 | 0.7594684 | 1.356394 | 2 | TUBA8 | Not Significant |
# Generate a plot for KEGG pathway enrichment in Monotherapy (Figure 1B)
ggplot(fgsea_tidy1[fgsea_tidy1$pval < 0.05, ], aes(reorder(pathway, NES), NES, fill = Significance)) +
geom_col() +
scale_fill_manual(values = c("Significant" = "red", "Not Significant" = "blue")) +
coord_flip() +
theme_minimal() +
labs(x = "Pathway", y = "Normalized Enrichment Score (NES)",
title = "KEGG Pathway Enrichment - Monotherapy (Pre-treatment & EDT)") +
theme(axis.text.y = element_text(size = 10), axis.title.x = element_text(size = 12))
# Pathways mentioned in the paper
paper_pathways <- c(
"KEGG_STARCH_AND_SUCROSE_METABOLISM",
"KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS",
"KEGG_METABOLISM_OF_XENOBIOTICS_BY_CYTOCHROME_P450",
"KEGG_RETINOL_METABOLISM",
"KEGG_LINOLEIC_ACID_METABOLISM",
"KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY",
"KEGG_CHEMOKINE_SIGNALING_PATHWAY",
"KEGG_NATURAL_KILLER_CELL_MEDIATED_CYTOTOXICITY",
"KEGG_NEUROTROPHIN_SIGNALING_PATHWAY",
"KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION",
"KEGG_JAK_STAT_SIGNALING_PATHWAY"
)
# Pathways from fgsea_tidy1_top
fgsea_pathways <- fgsea_tidy1_top$pathway
# Find the intersection between the two
common_pathways <- intersect(paper_pathways, fgsea_pathways)
# Print the common pathways
print(paste("Common Pathways:", paste(common_pathways, collapse = ", ")))
## [1] "Common Pathways: KEGG_CHEMOKINE_SIGNALING_PATHWAY, KEGG_NATURAL_KILLER_CELL_MEDIATED_CYTOTOXICITY"